The client: The city of San Francisco
The scenario: The city of SF is looking to make use of a data set dating back some 50 years that has never been put to any use and where they have lost the overview. We are a data consultants who were given the data set to create a preliminary exploratory analyis type report as a trail with view to being hired on a longer term project with the linked phrase.
Libraries that are used
#install.packages('dplyr')
library(dplyr)
#install.packages('lubridate')
library(lubridate)
#install.packages('tidyr')
library(tidyr)
#install.packages('ggplot2')
library(ggplot2)
library(tidyverse)
#install.packages('gganimate')
library(gganimate)
#install.packages('gifski')
library(gifski)
#install.packages('av')
library(av)
#install.packages('ggmap')
library(ggmap)
#install.packages('shiny')
library(shiny)
#install.packages('leaflet')
library(leaflet)
df_trees_orig <- read.csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv')
We create a copy of data to preserve its original form in case we want to look at it for reference.
df_trees <- df_trees_orig
We look at the data structure, and see all the string variables are saved as char. We would rather have them as factors for potential modelling use.
str(df_trees)
## 'data.frame': 192987 obs. of 12 variables:
## $ tree_id : int 53719 30313 30312 30314 30315 30316 48435 30319 30318 30320 ...
## $ legal_status: chr "Permitted Site" "Permitted Site" "Permitted Site" "DPW Maintained" ...
## $ species : chr "Tree(s) ::" "Tree(s) ::" "Tree(s) ::" "Pittosporum undulatum :: Victorian Box" ...
## $ address : chr "2963 Webster St" "501 Arkansas St" "501 Arkansas St" "501 Arkansas St" ...
## $ site_order : int 1 3 2 1 5 6 4 2 1 3 ...
## $ site_info : chr "Sidewalk: Curb side : Cutout" "Sidewalk: Curb side : Cutout" "Sidewalk: Curb side : Cutout" "Sidewalk: Curb side : Cutout" ...
## $ caretaker : chr "Private" "Private" "Private" "Private" ...
## $ date : chr "1955-09-19" "1955-10-20" "1955-10-20" "1955-10-20" ...
## $ dbh : int NA NA NA 16 NA NA NA NA NA NA ...
## $ plot_size : chr NA NA NA NA ...
## $ latitude : num 37.8 37.8 37.8 37.8 37.8 ...
## $ longitude : num -122 -122 -122 -122 -122 ...
Changing columns with type character to type factor.
NOTE: You need double brackets because single bracket indexing returns a df object, which always evaluates to FALSE. Double brackets returns you the column as a vector whose data type can be tested. Alternatively, we can use single brackets but explicitly calling all rows and columns i.e [,i]
for (i in 1:12){
if (is.character(df_trees[,i]) == TRUE) {
df_trees[,i]<-as.factor(df_trees[,i])
}
}
Date column has character data type. We change it to date.
# df_trees$date <- as.Date(df_trees$date,format="%Y-%m-%d") #using base r
df_trees$date <- lubridate::ymd(df_trees$date) #using lubridate
Save the planted year of the tree as a separate column
# df_trees$year <- format(df_trees$date, "%Y") #using base r
df_trees$year <- year(df_trees$date) # using lubridate
class(df_trees$year)
## [1] "numeric"
Take a look at the format of species name in species column.
head(unique(df_trees$species))
## [1] Tree(s) ::
## [2] Pittosporum undulatum :: Victorian Box
## [3] Acacia melanoxylon :: Blackwood Acacia
## [4] Magnolia grandiflora :: Southern Magnolia
## [5] Corymbia ficifolia :: Red Flowering Gum
## [6] Ginkgo biloba :: Maidenhair Tree
## 571 Levels: :: ... Ziziphus jujuba :: Jujube
we see that for each species, the species column reports both the latin and common names. For future analysis, it might be more convenient to split the latin and common names into respective columns.
Separate the species column into latin and common name columns.
df_trees <- df_trees %>%
separate(species, sep = "::", remove = FALSE, into = c('species_lat', 'species_nor'))
Check the percentage of missing values in each column
for (col in colnames(df_trees)){
print(paste(col, "=", mean(is.na(df_trees[[col]])))) #mean takes percentage here because adds up all TRUES (since their value is 1)
#and divides by total number of values (TRUE + FALSE)
}
## [1] "tree_id = 0"
## [1] "legal_status = 0.000279811593527025"
## [1] "species = 0"
## [1] "species_lat = 0"
## [1] "species_nor = 0"
## [1] "address = 0.00770518221434604"
## [1] "site_order = 0.00846689155228072"
## [1] "site_info = 0"
## [1] "caretaker = 0"
## [1] "date = 0.645691160544493"
## [1] "dbh = 0.216693352401975"
## [1] "plot_size = 0.259152170871613"
## [1] "latitude = 0.0146745635716395"
## [1] "longitude = 0.0146745635716395"
## [1] "year = 0.645691160544493"
Column with most significant amount of missing data is the date column.
We see almost every species has missing values (517 out of 571 species) so there is at least no clear indication that the data is missing not at random.
df_trees %>% filter(is.na(date))%>%
select(species)%>%
unique() %>% dim()
## [1] 517 1
Further analysis of missing date values: percentage of observations with missing date values, per species.
df_trees_datena <- df_trees %>% filter(is.na(date)) %>%
count(species)
df_trees_species_count <- df_trees %>% count(species)
df_trees_merge <- dplyr::left_join(df_trees_species_count, df_trees_datena, by="species", suffix = c("total", "date.na"))
df_trees_merge$percent.missing <- df_trees_merge$ndate.na / df_trees_merge$ntotal
df_trees_merge %>%
arrange(desc(ntotal), desc(percent.missing)) %>%
head()
## species ntotal ndate.na
## 1 Tree(s) :: 11629 1563
## 2 Platanus x hispanica :: Sycamore: London Plane 11557 9773
## 3 Metrosideros excelsa :: New Zealand Xmas Tree 8744 6470
## 4 Lophostemon confertus :: Brisbane Box 8581 4481
## 5 Tristaniopsis laurina :: Swamp Myrtle 7197 3682
## 6 Pittosporum undulatum :: Victorian Box 7122 5080
## percent.missing
## 1 0.1344054
## 2 0.8456347
## 3 0.7399360
## 4 0.5222002
## 5 0.5116021
## 6 0.7132828
We look at the total number of observations we would lose if we drop species where missing 100% of date information. And the number is very small relative to total number of observations (n = 192987)
df_trees_merge %>% filter(percent.missing==1) %>% select(ntotal) %>%
sum()
## [1] 731
How the percentage of missing date values per species is distributed. All species that have 100% of date data missing have a very low number of observations (n <=100).
df_trees_merge %>%
ggplot(aes(percent.missing)) + geom_histogram(fill='skyblue4') +
labs(title='Percentage of Missing Date Values per Species') +
xlab('Percentage of missing date') + ylab('Count') +
scale_x_continuous(breaks=seq(0,1, .10)) +theme_minimal()
Therefore, we now filter the cases based on this 100 ntotal threshold to get a cleaner histogram.
df_trees_merge %>% filter(ntotal>100) %>%
ggplot(aes(percent.missing))+geom_histogram(fill='skyblue4') +
labs(title='Percentage of Missing Date Values per Species') +
xlab('Percentage of missing date') + ylab('Count') +
scale_x_continuous(breaks=seq(0,1, .10)) + theme_minimal()
Checking year of plantation range. most of the trees (50%) were planted between 2001 and 2020. However, it is likely that many of the older trees simply were not recorded so they either dont appear in our data or have a missing date value.
df_trees %>%
select(year)%>%
quantile(na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 1955 1995 2001 2008 2020
Create a new column that shows age using the year column to calculate it. Use the more modern approach mutate(). Then use age column to get an overview of the distribution of tree ages via a histogram.
df_trees <- df_trees %>%
mutate(age = year(Sys.Date())-year)
hist(df_trees$age, main = 'Tree Age Distribution', xlab = 'Age', col = 'skyblue4') # using base r
Count the species
df_trees %>%
count(species_lat) %>%
arrange(desc(n)) %>%
top_n(n = 20)
## species_lat n
## 1 Tree(s) 11629
## 2 Platanus x hispanica 11557
## 3 Metrosideros excelsa 8744
## 4 Lophostemon confertus 8581
## 5 Tristaniopsis laurina 7197
## 6 Pittosporum undulatum 7122
## 7 Prunus cerasifera 6716
## 8 Magnolia grandiflora 6285
## 9 Arbutus 'Marina' 5702
## 10 Ficus microcarpa nitida 'Green Gem' 5624
## 11 Prunus serrulata 'Kwanzan' 4025
## 12 Acacia melanoxylon 3956
## 13 Maytenus boaria 3899
## 14 Olea europaea 3694
## 15 Corymbia ficifolia 3575
## 16 Callistemon citrinus 3266
## 17 Ginkgo biloba 3212
## 18 Pyrus calleryana 2969
## 19 Prunus serrulata 2696
## 20 Eriobotrya deflexa 2402
Show number of distinct species in a street
df_trees %>%
filter(address != 'NA') %>%
group_by(address) %>%
summarise(n = n_distinct(species_lat)) %>%
arrange(desc(n)) %>%
top_n(n = 10)
## # A tibble: 13 x 2
## address n
## <fct> <int>
## 1 900x Carolina St 31
## 2 1201 San Jose Ave 30
## 3 1301 Minnesota St 28
## 4 310 Liberty St 26
## 5 1000 San Jose Ave 24
## 6 1100 San Jose Ave 24
## 7 2200 Sunset Blvd 20
## 8 2600 Sunset Blvd 20
## 9 700 Junipero Serra Blvd 20
## 10 1101 San Jose Ave 18
## 11 1200 Sunset Blvd 18
## 12 2100 Sunset Blvd 18
## 13 500x 27th St 18
To do this specific analysis we drop the trees, which have missing date data for obvious reasons. Based on our missing date data analysis in the data cleaning section, we are confident the data does not display any obvious signs of MNAR.
Selected related columns
df_trees_hasdate <- df_trees %>%
filter(!is.na(date)) %>%
select(c("tree_id", "species_lat", "year"))
str(df_trees_hasdate)
## 'data.frame': 68377 obs. of 3 variables:
## $ tree_id : int 53719 30313 30312 30314 30315 30316 48435 30319 30318 30320 ...
## $ species_lat: chr "Tree(s) " "Tree(s) " "Tree(s) " "Pittosporum undulatum " ...
## $ year : num 1955 1955 1955 1955 1955 ...
Species_lat is saved as a factor.
df_trees_hasdate$species_lat <- as.factor(df_trees_hasdate$species_lat)
To make the number of species more manageable, we group them by family names (i.e Acacia, Magnolia)
unique(df_trees_hasdate[grep(df_trees_hasdate$species_lat, pattern = "^Acacia", ignore.case = TRUE), 'species_lat'])
## [1] Acacia melanoxylon Acacia longifolia
## [3] Acacia stenophylla Acacia baileyana
## [5] Acacia baileyana 'Purpurea' Acacia cognata
## [7] Acacia spp Acacia decurrens
## 427 Levels: Acacia baileyana Acacia baileyana 'Purpurea' ... Zelkova serrata 'Village Green'
unique(df_trees_hasdate[grep(df_trees_hasdate$species_lat, pattern = "^Magnolia", ignore.case = TRUE), 'species_lat'])
## [1] Magnolia grandiflora
## [2] Magnolia spp
## [3] Magnolia grandiflora 'Little Gem'
## [4] Magnolia champaca
## [5] Magnolia doltsopa
## [6] Magnolia grandiflora 'Samuel Sommer'
## [7] Magnolia grandiflora 'Saint Mary'
## [8] Magnolia grandiflora 'Russet'
## [9] Magnolia x soulangiana
## [10] Magnolia x foggii 'Jack Fogg'
## [11] Magnolia grandiflora 'Majestic Beauty'
## [12] Magnolia grandiflora 'D.D. Blanchard'
## [13] Magnolia sargentiana 'Robusta'
## [14] Magnolia x alba
## [15] Magnolia doltsopa 'Silvercloud'
## 427 Levels: Acacia baileyana Acacia baileyana 'Purpurea' ... Zelkova serrata 'Village Green'
It seems that specific species can be grouped together with a general name. Since the general name is the first word of the species name, we will separate the species name into its words and then use the first word as the species general name.
Separate the species_lat column into its parts
df_trees_hasdate <- df_trees_hasdate %>%
separate(species_lat, sep = " ", remove = FALSE, into = c('species_first', 'species_second', 'species_third'))
The code below shows that the number of unique species after recategorizing species_lat is 158.
length(unique(df_trees_hasdate$species_first))
## [1] 158
Create a count column to save number of planted trees by species, per year.
df_trees_hasdate_tidy <- df_trees_hasdate %>%
group_by(year) %>%
count(species_first) %>%
arrange(year, desc(n))
As a bit of side note we look at the total number of planted trees per year regardless of species to get a sense of overall tree planting
df_trees_hasdate_tidy %>%
select(c('year', 'n')) %>%
aggregate(by = list(df_trees_hasdate_tidy$year), FUN='sum') %>%
ggplot(aes(x = Group.1, y = n)) +
geom_line() +
labs(title = 'Total Number of Planted Trees per Year',
x = 'Year', y = 'Total number of planted trees') +
theme_minimal()
In this step, We’re going to filter our data set to retain only the top 10 planted species for every given year to keep the visuals understandable. Also we drop the species labelled Tree(s) as this obviously tells us nothing about the species.
df_trees_hasdate_formated <- df_trees_hasdate_tidy %>% subset(species_first != 'Tree(s)') %>%
group_by(year) %>%
mutate(rank = rank(-n)) %>%
filter(rank <=10)
In this step, we will create individual static graphs for each year.
staticplot = ggplot(df_trees_hasdate_formated, aes(rank, group = species_first,
fill = as.factor(species_first), color = as.factor(species_first))) +
geom_tile(aes(y = n/2,
height = n,
width = 0.9), alpha = 0.8, color = NA) +
geom_text(aes(y = 0, label = paste(species_first, " ")), vjust = 0.2, hjust = 1) +
geom_text(aes(y=n, label = n, hjust=0)) +
coord_flip(clip = "off", expand = FALSE) +
scale_y_continuous(labels = scales::comma) +
scale_x_reverse() +
guides(color = FALSE, fill = FALSE) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
legend.position="none",
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.grid.major.x = element_line( size=.1, color="grey" ),
panel.grid.minor.x = element_line( size=.1, color="grey" ),
plot.title=element_text(size=25, hjust=0.5, face="bold", colour="grey", vjust=-1),
plot.subtitle=element_text(size=18, hjust=0.5, face="italic", color="grey"),
plot.caption =element_text(size=8, hjust=0.5, face="italic", color="grey"),
plot.background=element_blank(),
plot.margin = margin(2,2, 2, 4, "cm"))
anim = staticplot + transition_states(year, transition_length = 4, state_length = 1) +
view_follow(fixed_x = TRUE) +
labs(title = 'Total Number of Planted Trees per Year: {closest_state}',
subtitle = "Top 10 Species")
For gif
animate(anim, 200, fps = 2, width = 1200, height = 1000,
renderer = gifski_renderer("gganim.gif"))
find out which are the top 5 most planted species over entire time period. ‘Trees’ category will be omitted as it tells us nothing about species
df_agg <- aggregate(df_trees_hasdate_tidy$n, by=list(df_trees_hasdate_tidy$species_first), FUN='sum')
df_agg %>%
arrange(desc(x)) %>%
top_n(n=6)
## Group.1 x
## 1 Tree(s) 10066
## 2 Prunus 6393
## 3 Tristaniopsis 4652
## 4 Lophostemon 4100
## 5 Magnolia 3773
## 6 Arbutus 3475
create df to be used for cumulative summation
df_top5 <- df_trees_hasdate_tidy %>%
filter(species_first == 'Prunus' | species_first == 'Tristaniopsis' | species_first == 'Lophostemon' | species_first == 'Magnolia' | species_first =='Arbutus')
Calculate the cumulative summation for each species.
df_top5_cumsum <- df_top5 %>%
group_by(species_first) %>%
mutate(csum = cumsum(n))
Save the data type of year column as Date
df_top5_cumsum$year <- lubridate::ymd(df_top5_cumsum$year, truncated = 2L)
Create the static plot
species_cumsum <- ggplot(df_top5_cumsum, aes(x=year,y=csum, group=species_first, col=factor(species_first))) + geom_line() + scale_color_manual(values = c('#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00')) + theme_minimal()
Create the animated plot with transition
anim2 <- species_cumsum + transition_reveal(year) + view_follow(fixed_x = TRUE) +
labs(title = 'Cumulative Total of Top 5 Species over Time', x = 'Year', y = 'Cumulative Sum')
Animated graph (gif version):
animate(anim2, 200, fps = 5, width = 1200, height = 1000,
renderer = gifski_renderer("gganim2.gif"))
map of recommended species per linked phrase:
recommended_sp <- c(' Japanese Blueberry Tree', ' Flaxleaf Paperbark', ' Red Flowering Gum', ' Flowering Cherry',
' Little Gem Magnolia', ' Southern Magnolia', ' Weeping Bottlebrush', ' Hybrid Strawberry Tree',
' Primrose Tree', ' Brisbane Box', ' Mediterranean Fan Palm', ' Fruitless Olive', ' Chilean Soapbark',
" Small-leaf Tristania 'Elegant'", ' Chinese Pistache', ' Trident Maple', ' Chinese Elm', ' Cork Oak',
' Ginkgo: Autumn Gold', ' Fairmont Ginkgo', ' Ginkgo: Saratoga', ' Autumn Sentinel Ginkgo'
)
Species_nor is saved as a factor.
df_trees$species_nor <- as.factor(df_trees$species_nor)
Recommended species are filtered.
df_trees_recommended <- df_trees %>%
filter(species_nor %in% recommended_sp)
unique(df_trees_recommended$species_nor)
## [1] Southern Magnolia Red Flowering Gum
## [3] Hybrid Strawberry Tree Chinese Elm
## [5] Brisbane Box Weeping Bottlebrush
## [7] Little Gem Magnolia Small-leaf Tristania 'Elegant'
## [9] Chinese Pistache Fruitless Olive
## [11] Japanese Blueberry Tree Trident Maple
## [13] Flaxleaf Paperbark Primrose Tree
## [15] Ginkgo: Autumn Gold Chilean Soapbark
## [17] Cork Oak Mediterranean Fan Palm
## [19] Fairmont Ginkgo Ginkgo: Saratoga
## [21] Autumn Sentinel Ginkgo Flowering Cherry
## 551 Levels: 'Akebono' Cherry Acacia Spp ... Yucca
Get the coordinates of the whole dataset
df_trees_recommended %>%
summarise(max_lon = max(longitude, na.rm=TRUE), min_lon = min(longitude, na.rm=TRUE),
max_lat = max(latitude, na.rm=TRUE), min_lat = min(latitude, na.rm=TRUE))
## max_lon min_lon max_lat min_lat
## 1 -122.3692 -138.2839 47.27022 37.70793
Create a map with leaflet() package
## for all species
## will be added a color for each species.
#map center
mlong = -122.4446
mlat = 37.72
pal <- colorFactor('Paired', domain = recommended_sp)
leaflet(data = df_trees_recommended,
width = '100%',
height = 800,
options = leafletOptions(minZoom = 9, maxZoom = 18)) %>%
addTiles() %>%
setView(lng = mlong, lat = mlat, zoom = 12) %>%
addCircleMarkers(lng = df_trees_recommended$longitude,
lat = df_trees_recommended$latitude,
color = ~pal(df_trees_recommended$species_nor),
popup = df_trees_recommended$species_nor,
label = df_trees_recommended$species_nor,
radius = 4) %>%
addLayersControl(overlayGroups = df_trees_recommended$species_nor) ## add layers control
But map is too chaotic given large number of species…sooo Shiny to the rescue:
Create a shiny app version of the map above, with a check-box to look at each species at a time.
<>
create list of rare species (n = 1) with name matching original df_trees
species_rare <- df_trees_species_count%>%
filter(n == 1) %>%
separate(species, sep = "::", remove = FALSE, into = c('species_lat', 'species_nor'))
Use list of rare species to subset original df_trees and also group them by common first name (i.e family name) to reduce species number
df_trees_rare <- df_trees %>%
filter(species_nor %in% species_rare$species_nor) %>%
separate(species_lat, sep = " ", remove = FALSE, into = c('species_first', 'species_second', 'species_third'))
Plot map using leaflet, to create a hover-over dot pop up with species name, given not enough colors to differentiate so many species
#map center
mlong = -122.4446
mlat = 37.72
rare_species_vec <- unique(df_trees_rare$species_lat)
pal <- colorFactor('Paired', domain = rare_species_vec)
leaflet(data = df_trees_rare,
width = '100%', height = 800,
options = leafletOptions(minZoom = 9, maxZoom = 18)) %>%
addTiles() %>%
setView(lng = mlong, lat = mlat, zoom = 12) %>%
addMarkers(lng = df_trees_rare$longitude,
lat = df_trees_rare$latitude,
icon = list(
iconUrl = 'https://icons.iconarchive.com/icons/matthew-kelleigh/mac-town-vol2/32/Tree-1-icon.png',
#'https://icons.iconarchive.com/icons/pixelkit/flat-jewels/512/Tree-icon.png',
iconSize = c(30, 30)
),
popup = df_trees_rare$species_lat,
label = df_trees_rare$species_lat)
#radius = 4,
#fillOpacity = .5)